import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, recall_score, precision_score, f1_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer, StandardScaler
from sklearn.tree import DecisionTreeClassifier

pt = PowerTransformer()
sc = StandardScaler()
warnings.filterwarnings('ignore')
# Load the Parkinson's voice-measurement dataset.
# NOTE(review): hard-coded local Windows path — will only run on this machine.
df=pd.read_csv("C:/Users/HP/Downloads/parkinsons.data")
# Quick EDA. These are notebook-style bare expressions; when executed as a
# plain script they compute but display nothing.
df.head()
df.describe().transpose()
# Drop the non-numeric record identifier so only numeric features remain.
df.drop('name',axis=1,inplace=True)
sns.pairplot(df,diag_kind='kde')
# Missing-value counts, column dtypes, and per-feature skewness.
df.isnull().sum()
df.dtypes
df.skew(axis = 0, skipna = True)
import seaborn as sns
# Pairplot colored by the target column 'status'.
sns.pairplot(df,diag_kind='kde',hue='status')
# Correlation heatmap of all numeric features.
corr = df.corr()
ax = sns.heatmap(
corr,
vmin=0, vmax=1,
cmap=sns.diverging_palette(20, 250, n=200),
square=True
)
ax.set_xticklabels(
ax.get_xticklabels(),
rotation=90,
horizontalalignment='right'
);
df.corr(method ='pearson')
# Feature matrix X (all columns except the target) and target vector y.
X=df.drop('status',axis=1)
y=df.status
# Baseline decision tree on the raw (unscaled) features.
# BUG FIXES vs. original:
#  - `dt_model.fit(...)` was called before `train_set`/`dt_model` existed
#    (NameError); the split and model construction now come first.
#  - the inspection lines used an undefined name `dt`; they now use `dt_model`.
train_set, test_set, train_labels, test_labels = train_test_split(
    X, y, test_size=0.30, random_state=1
)
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt_model.fit(train_set, train_labels)
print("TrainingScore : {0} ".format(dt_model.score(train_set, train_labels)))
print("TestingScore : {0}".format(dt_model.score(test_set, test_labels)))
# Inspect what the fitted tree learned (bare expressions, notebook-style).
dt_model.feature_importances_
print(dt_model.predict(test_set))
dt_model.predict_proba(test_set)
from matplotlib import pyplot as plt
import numpy as np
# Sweep the tree's max_depth and plot mean train/test accuracy over 20
# random splits per depth.
# IMPROVEMENTS vs. original: the 20 splits (deterministic via random_state=j)
# and their scaling do not depend on the depth, so they are computed once
# instead of once per depth (~99x fewer splits, identical results); the
# misleading local name `neighbors` is now `depths`.
depths = np.arange(1, 100)
train_accuracy_plot = np.empty(len(depths))
test_accuracy_plot = np.empty(len(depths))
# Precompute the 20 standardized splits once.
splits = []
for j in range(20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=j)
    scaler = StandardScaler()
    splits.append((scaler.fit_transform(X_train), scaler.transform(X_test), y_train, y_test))
# Loop over different values of max_depth.
for i, k in enumerate(depths):
    train = []
    test = []
    for scaledX_train, scaledX_test, y_train, y_test in splits:
        dt = DecisionTreeClassifier(max_depth=k)
        dt.fit(scaledX_train, y_train)
        train.append(dt.score(scaledX_train, y_train))
        test.append(dt.score(scaledX_test, y_test))
    # Mean accuracy across the 20 splits at this depth.
    train_accuracy_plot[i] = np.mean(train)
    test_accuracy_plot[i] = np.mean(test)
# Generate plot
plt.title('Decision Tree: Varying Depth')
plt.plot(depths, test_accuracy_plot, label='Testing Accuracy')
plt.plot(depths, train_accuracy_plot, label='Training Accuracy')
plt.legend()
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.show()
# Sweep the tree's min_samples_split and plot mean train/test accuracy over
# 20 random splits per value.
# BUG FIX vs. original: the plot title said "Varying Depth" although this
# sweep varies min_samples_split. Also hoists the 20 deterministic splits
# (random_state=j) out of the parameter loop — identical results, far less work.
neighbors = np.arange(2, 100)
train_accuracy_plot = np.empty(len(neighbors))
test_accuracy_plot = np.empty(len(neighbors))
# Precompute the 20 standardized splits once.
splits = []
for j in range(20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=j)
    scaler = StandardScaler()
    splits.append((scaler.fit_transform(X_train), scaler.transform(X_test), y_train, y_test))
# Loop over different values of min_samples_split.
for i, k in enumerate(neighbors):
    train = []
    test = []
    for scaledX_train, scaledX_test, y_train, y_test in splits:
        dt = DecisionTreeClassifier(min_samples_split=k)
        dt.fit(scaledX_train, y_train)
        train.append(dt.score(scaledX_train, y_train))
        test.append(dt.score(scaledX_test, y_test))
    # Mean accuracy across the 20 splits at this parameter value.
    train_accuracy_plot[i] = np.mean(train)
    test_accuracy_plot[i] = np.mean(test)
# Generate plot
plt.title('Decision Tree: Varying min_samples_split')
plt.plot(neighbors, test_accuracy_plot, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy_plot, label='Training Accuracy')
plt.legend()
plt.xlabel('min_samples_split')
plt.ylabel('Accuracy')
plt.show()
# Tuned decision tree on power-transformed features.
# BUG FIX vs. original: scaledX_train/scaledX_test were computed but the
# model was fit and scored on the raw X_train/X_test, making the
# PowerTransformer dead code. Fit and score on the transformed data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)
sc = PowerTransformer()
scaledX_train = sc.fit_transform(X_train)
scaledX_test = sc.transform(X_test)
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=1, min_samples_split=8, max_depth=50)
dt_model.fit(scaledX_train, y_train)
print("TrainingScore : {0} ".format(dt_model.score(scaledX_train, y_train)))
print("TestingScore : {0}".format(dt_model.score(scaledX_test, y_test)))
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost ensemble on power-transformed features.
# BUG FIX vs. original: the transformed features were computed but the model
# was fit/scored on raw X_train/X_test — use the scaled data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)
sc = PowerTransformer()
scaledX_train = sc.fit_transform(X_train)
scaledX_test = sc.transform(X_test)
abcl = AdaBoostClassifier(n_estimators=100)
abcl = abcl.fit(scaledX_train, y_train)
print("Training Score")
print(abcl.score(scaledX_train, y_train))
print("Testing Score")
print(abcl.score(scaledX_test, y_test))
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted ensemble on power-transformed features.
# BUG FIX vs. original: the transformed features were computed but the model
# was fit/scored on raw X_train/X_test — use the scaled data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)
sc = PowerTransformer()
scaledX_train = sc.fit_transform(X_train)
scaledX_test = sc.transform(X_test)
gbcl = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05)
gbcl = gbcl.fit(scaledX_train, y_train)
print("Training Score")
print(gbcl.score(scaledX_train, y_train))
print("Testing Score")
print(gbcl.score(scaledX_test, y_test))
# Sweep the random forest's n_estimators and plot mean train/test accuracy
# over 20 random splits per value.
# BUG FIXES vs. original:
#  - the accumulators were sized len(neighbors) (98, stale from the previous
#    experiment) instead of len(estimators) (15), leaving most entries as
#    uninitialized np.empty garbage;
#  - the results were plotted against `neighbors` instead of `estimators`.
estimators = np.arange(1, 300, 20)
train_accuracy_plot = np.empty(len(estimators))
test_accuracy_plot = np.empty(len(estimators))
# Loop over different numbers of trees.
for i, k in enumerate(estimators):
    train = []
    test = []
    for j in range(20):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=j)
        sc = StandardScaler()
        scaledX_train = sc.fit_transform(X_train)
        scaledX_test = sc.transform(X_test)
        rfcl = RandomForestClassifier(n_estimators=k, criterion='entropy')
        rfcl.fit(scaledX_train, y_train)
        train.append(rfcl.score(scaledX_train, y_train))
        test.append(rfcl.score(scaledX_test, y_test))
    # Mean accuracy across the 20 splits at this forest size.
    train_accuracy_plot[i] = np.mean(train)
    test_accuracy_plot[i] = np.mean(test)
# Generate plot
plt.title('Random Forest: Optimal Number of Trees')
plt.plot(estimators, test_accuracy_plot, label='Testing Accuracy')
plt.plot(estimators, train_accuracy_plot, label='Training Accuracy')
plt.legend()
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
plt.show()
# Final random-forest model: standardize the features, fit 200 entropy
# trees, and report train/test accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=3)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
rfcl = RandomForestClassifier(n_estimators=200, criterion='entropy')
rfcl.fit(X_train_std, y_train)
print("Training Score")
print(rfcl.score(X_train_std, y_train))
print("Testing Score")
print(rfcl.score(X_test_std, y_test))